


# Attach the packages this script relies on further down:
#   stringr   - str_detect(), str_count()
#   lubridate - %m-%, months() used as a period, days()
# Without them the filters and date arithmetic below fail in a fresh R session.
library(stringr)
library(lubridate)

# Load the input workspaces. The rest of the script expects them to provide the
# objects `Recallw` (recall records) and `Main` (vehicle registrations).
# NOTE(review): which file supplies which object is not visible here - confirm.
load("Recall_week1.RData")
load("Registration_recalls_merged.RData")


# RULE 1: every car in the recall data must also appear in the main data,
# i.e. drop motorbikes, vans, trucks and special passenger cars (campers, ambulances, etc.)
# Tabulate how many recall rows have / lack a matching plate in Main
table(Recallw$kenteken %in% Main$kenteken)
# Keep only the recall rows whose plate is present in Main
Recallw <- subset(Recallw, is.element(kenteken, Main$kenteken))


# RULE 2: drop cars exported before November 2017. A car is kept when either
#   - it has no export date and its export indicator is "Nee", or
#   - its last export date starts with 2019, 2018, 2017-11 or 2017-12.
never_exported <- is.na(Main$exported_last) & Main$exportindicator == "Nee"
exported_2019 <- str_detect(Main$exported_last, "^2019")
exported_2018 <- str_detect(Main$exported_last, "^2018")
exported_2017_11 <- str_detect(Main$exported_last, "^2017-11")
exported_2017_12 <- str_detect(Main$exported_last, "^2017-12")
i <- never_exported | exported_2019 | exported_2018 | exported_2017_11 | exported_2017_12
rm(never_exported, exported_2019, exported_2018, exported_2017_11, exported_2017_12)

# Share of rows kept vs dropped
table(i) / nrow(Main)
# About 3.2% of current sample

# subset() treats an NA in the mask as "drop", so undecidable rows are removed too
Main <- subset(Main, i)
# Keep the recall data aligned with the reduced Main
Recallw <- subset(Recallw, kenteken %in% Main$kenteken)
rm(i)

# RULE 3: keep only cars that received zero or exactly one recall.
# A missing rec_codes is kept, as is a rec_codes holding exactly one ";".
semicolon_count <- str_count(Main$rec_codes, ";")
i <- semicolon_count == 1 | is.na(Main$rec_codes)
rm(semicolon_count)

# Counts and shares of rows kept vs dropped
table(i)
table(i) / nrow(Main)
# Drop 7.5% of sample, or about 584250 cars

Main <- subset(Main, i)
# Keep the recall data aligned with the reduced Main
Recallw <- subset(Recallw, kenteken %in% Main$kenteken)
rm(i)

# RULE 4: remove cars whose recall starts already closed, is not recorded in the
# main dataset, and that were not exported.
# Recall rows with a start date and a first status of "P"
closed_at_start <- subset(Recallw, !is.na(recall_new) & status_first == "P")
# Of those cars, the ones Main shows with neither an export date nor a recall start
unrecorded <- subset(
  Main,
  kenteken %in% closed_at_start$kenteken & is.na(exported_last) & is.na(recall_new)
)
# Share of Main affected
nrow(unrecorded) / nrow(Main)
# Small amount of cars removed
Main <- subset(Main, !(kenteken %in% unrecorded$kenteken))
# Keep the recall data aligned with the reduced Main
Recallw <- subset(Recallw, kenteken %in% Main$kenteken)
rm(closed_at_start, unrecorded)

# RULE 5: remove scrapped/stolen cars, i.e. keep only rows without a scrappage date
no_scrappage_date <- is.na(Main$scrappage_date)
Main <- subset(Main, no_scrappage_date)
rm(no_scrappage_date)
# Keep the recall data aligned with the reduced Main
Recallw <- subset(Recallw, kenteken %in% Main$kenteken)

# Drop one odd car (it has the same recall twice) from both datasets
odd_plate <- "KS932Z"
Main <- subset(Main, kenteken != odd_plate)
Recallw <- subset(Recallw, kenteken != odd_plate)
rm(odd_plate)

# Remove every car involved in recall MGP100088. The registry note for it said:
# "This promotion was already organized by the producer in 2010. The owners of
# remaining vehicles are still approached by the producer."
# Lots of cars in this recall appear in the dataset as already fixed.
# Plates of the affected cars; which() skips rows whose reference code is missing
i <- Recallw$kenteken[which(Recallw$referentiecoderdw == "MGP100088")]
Main <- subset(Main, !(kenteken %in% i))
Recallw <- subset(Recallw, !(kenteken %in% i))

# Smooth recall dates using monthly changes in registration status

# Registration-side dates for every car in Main that has a recall start or a fix date
Temp <- subset(
  Main,
  !(is.na(recall_new) & is.na(recall_fixed)),
  select = c(kenteken, recall_new, recall_fixed)
)
# Left join onto the recall records. recall_new / recall_fixed exist on both sides,
# so they come back as .x (recall data) and .y (registration data).
Temp1 <- merge(Recallw, Temp, by = "kenteken", all.x = TRUE, all.y = FALSE)

# Convert both registration-side columns to dates in the same way
for (reg_col in c("recall_new.y", "recall_fixed.y")) {
  # Strip "NA;" fragments, then parse as day/month/year
  raw_dates <- gsub("NA;", "", Temp1[[reg_col]])
  # Remove one month because the actual status change occurred the month earlier
  shifted <- as.Date(raw_dates, format = "%d/%m/%Y") %m-% months(1)
  # Add 11 days to land roughly in the middle of the month
  Temp1[[reg_col]] <- shifted + days(11)
}
rm(reg_col, raw_dates, shifted)

# Recall-data dates equal to one of these two days are overwritten with the
# registration-based date (which may itself be NA).
# NOTE(review): presumably these are bulk/snapshot dates of the recall file - confirm.

# Recall start dates
for (flagged_day in c("2018-12-28", "2019-02-15")) {
  i <- !is.na(Temp1$recall_new.x) & Temp1$recall_new.x == flagged_day
  Temp1$recall_new.x[i] <- Temp1$recall_new.y[i]
  # All recalls start as open
  Temp1$status_first[i] <- "O"
}

# Recall fix dates
for (flagged_day in c("2018-12-28", "2019-02-15")) {
  i <- !is.na(Temp1$recall_fixed.x) & Temp1$recall_fixed.x == flagged_day
  Temp1$recall_fixed.x[i] <- Temp1$recall_fixed.y[i]
}
rm(i, flagged_day)

# Lots of recalls in Sep 2018-Feb 2019 period

# If a car-specific recall date is not available, get dates from the recall registry.
# The load() below is expected to create the object `x` used in the next lines.
load("Terugroep_actie_6 april 2019.RData")
# Normalise the column names: drop dots, then make everything lower case
names(x) <- tolower(gsub("\\.", "", names(x)))
# Transform factor columns to character. vapply() always yields a logical vector,
# whereas sapply() returns list() for a zero-column input and cannot index columns.
is_factor_col <- vapply(x, is.factor, logical(1))
x[is_factor_col] <- lapply(x[is_factor_col], as.character)
rm(is_factor_col)
# Only the recall reference code and its publication date are needed
x <- subset(x, select = c(referentiecoderdw, publicatiedatumrdw))
# NOTE(review): if the registry holds several rows per referentiecoderdw, this
# left join duplicates Temp1 rows - confirm the key is unique.
Temp1 <- merge(Temp1, x, by = "referentiecoderdw", all.x = TRUE, all.y = FALSE)
# Publication date is parsed as yyyymmdd; `origin` is not used when parsing
# character input with an explicit format, so it is omitted.
Temp1$publicatiedatumrdw <- as.Date(as.character(Temp1$publicatiedatumrdw), format = "%Y%m%d")

# Get export indicator: attach the last export date from Main to each recall row
Export <- subset(Main, !is.na(exported_last), select = c(kenteken, exported_last))
Temp1 <- merge(Temp1, Export, by = "kenteken", all.x = TRUE, all.y = FALSE)

has_export_date <- !is.na(Temp1$exported_last)

# exported_before_newr = 1 when the recall start falls after the export date.
# The car-specific start is used when present, the publication date otherwise.
has_start <- !is.na(Temp1$recall_new.x)
start_after_export <- has_start & Temp1$recall_new.x > Temp1$exported_last
publication_after_export <- !has_start & Temp1$publicatiedatumrdw > Temp1$exported_last
i <- has_export_date & (start_after_export | publication_after_export)
Temp1$exported_before_newr <- 0
# Rows where the mask is NA are left at 0
Temp1$exported_before_newr[i] <- 1
rm(i, has_start, start_after_export, publication_after_export)

# exported_before_fixed = 1 when the fix date falls after the export date
has_fix <- !is.na(Temp1$recall_fixed.x)
i <- has_export_date & has_fix & Temp1$recall_fixed.x > Temp1$exported_last
Temp1$exported_before_fixed <- 0
Temp1$exported_before_fixed[i] <- 1
rm(i, has_fix, has_export_date)

# Diagnostic: for rows with a car-specific recall start and neither export flag set,
# look at the gap between that start and the registry publication date.
Temp3 <- subset(
  Temp1,
  !is.na(recall_new.x) & exported_before_newr == 0 & exported_before_fixed == 0
)
Temp3$diff <- with(Temp3, recall_new.x - publicatiedatumrdw)
# Quantiles of the gap in 2.5% steps
quantile(Temp3$diff, probs = seq(0, 1, by = 0.025), na.rm = TRUE)
# Pretty good correspondence between individual recall start and publication date
# Recalls with large discrepancies:
# MGP170644: Rolling, but all starting with open recalls
# MGP170645: Same starting period for everyone, all starting with open recalls
# MGP180384: Some cars start later, but all starting with open recalls (except exported)
# MGP170289: All starting the same period, all starting open
# MGP160298: All starting the same period, all starting open
# MGP160354: All starting the same period, all starting open
# MGP160062: All starting the same period, all starting open
# MGP150305: Rolling, but all starting with open recalls
# MGP150307: All starting more or less in the same period, all starting open
# MGP160181

# When the individual recall date is not available (i.e. before Sep/Nov 2017),
# fall back to the publication date from the registry
no_individual_start <- is.na(Temp1$recall_new.x)
Temp1$recall_new.x[no_individual_start] <- Temp1$publicatiedatumrdw[no_individual_start]
rm(no_individual_start)

# Check: does the fix date in the recall database match the registration database?
# Compare the two dates on the rows where both are present.
Temp6 <- subset(Temp1, !is.na(recall_fixed.x) & !is.na(recall_fixed.y))
Temp6$diff <- Temp6$recall_fixed.x - Temp6$recall_fixed.y
quantile(Temp6$diff, probs = seq(0, 1, 0.025))
# Same quantiles restricted to rows with missing == 0. which() drops rows where
# `missing` is NA; otherwise they would add NA values and make quantile() fail.
quantile(Temp6$diff[which(Temp6$missing == 0)], probs = seq(0, 1, 0.025))
# Not many: explanation: missing cars in recall database in some parts
# If so: substitute with the date from the registration database.
# which() keeps NA out of the index: R rejects NA subscripts in an assignment
# whose replacement has more than one value.
i <- which(Temp1$missing == 1)
Temp1$recall_fixed.x[i] <- Temp1$recall_fixed.y[i]
rm(i)

# Recall dataset ready now

# Integrate with Main
# Carry the cleaned recall-side dates forward under their plain names
Temp1$recall_new <- Temp1$recall_new.x
Temp1$recall_fixed <- Temp1$recall_fixed.x

# Drop the merge-suffixed date columns and the two export helper flags
for (old_col in c("recall_new.x", "recall_new.y",
                  "recall_fixed.x", "recall_fixed.y",
                  "exported_before_newr", "exported_before_fixed")) {
  Temp1[[old_col]] <- NULL
}
rm(old_col)

# Remove intermediate objects that are no longer needed
rm(Temp, Temp3, Temp6, Export)

# Registration fields from Main that go into the final dataset
Sales <- subset(
  Main,
  select = c(
    kenteken, datumtenaamstelling_all, datumeersteafgiftenederland,
    datumeerstetoelating, typegoedkeuringsnummer, variant, uitvoering
  )
)

# Full outer join: keep cars without recall rows and recall rows without a Sales match
Data <- merge(Sales, Temp1, by = "kenteken", all = TRUE)

# Parse the two day/month/year date columns
for (date_col in c("datumeersteafgiftenederland", "datumeerstetoelating")) {
  Data[[date_col]] <- as.Date(as.character(Data[[date_col]]), format = "%d/%m/%Y", origin = "1970-01-01")
}
rm(date_col)

# Write the merged dataset to disk
save(Data, file = "Recalls_merged_2019.RData")



